In [16]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
In [17]:
# Load the raw dataset from the CSV file next to the notebook.
df = pd.read_csv(filepath_or_buffer="Data.csv")
In [18]:
# Inspect the freshly loaded frame; the rendered table shows NaN entries in
# the Country, Age, Salary and Purchased columns.
df
Out[18]:
 | Country | Age | Salary | Purchased |
---|---|---|---|---|
0 | France | 44.0 | 72000.0 | Yes |
1 | Spain | 27.0 | 48000.0 | Yes |
2 | NaN | 30.0 | 54000.0 | NaN |
3 | Spain | 38.0 | 61000.0 | No |
4 | Germany | 40.0 | NaN | Yes |
5 | France | 35.0 | 58000.0 | Yes |
6 | Spain | NaN | 52000.0 | No |
7 | France | 48.0 | 79000.0 | Yes |
8 | Germany | 50.0 | 83000.0 | No |
9 | France | 37.0 | 67000.0 | Yes |
In [20]:
# Impute every missing entry with the most frequent value of its column.
# Caveat: Age and Salary contain no repeated values, so for those columns
# "most_frequent" resolves the tie by returning the smallest value
# (27.0 and 48000.0 for this data); switch to strategy="mean" or "median"
# if a central value is wanted for the numeric columns.
imputer = SimpleImputer(missing_values=np.nan, strategy="most_frequent")

# fit_transform learns the per-column fill values and applies them in one pass.
imputed_values = imputer.fit_transform(df.to_numpy())

# The imputer returns a plain object array, so rebuild the frame with the
# original row/column labels and cast each column back to its original dtype
# instead of relying on how pandas casts an object array on whole-frame
# assignment (which can leave Age/Salary as object dtype).
df = pd.DataFrame(imputed_values, columns=df.columns, index=df.index).astype(
    df.dtypes.to_dict()
)
In [ ]:
# Display the frame again after the imputation cell to check the filled-in values.
df
Out[ ]:
array([['France', 44.0, 72000.0, 'Yes'], ['Spain', 27.0, 48000.0, 'Yes'], ['France', 30.0, 54000.0, 'Yes'], ['Spain', 38.0, 61000.0, 'No'], ['Germany', 40.0, 48000.0, 'Yes'], ['France', 35.0, 58000.0, 'Yes'], ['Spain', 27.0, 52000.0, 'No'], ['France', 48.0, 79000.0, 'Yes'], ['Germany', 50.0, 83000.0, 'No'], ['France', 37.0, 67000.0, 'Yes']], dtype=object)
In [ ]: